Initialisation

Set Up

Load Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot) # For correlation plots
## corrplot 0.95 loaded
library(FactoMineR) # For PCA
library(factoextra) # For PCA plots
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(vegan) # For CCA
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-8
library(ggplot2)
library(rsample) # For sampling of data (split, analysis,..)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(tibble)
library(tidyr)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:randomForest':
## 
##     combine
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(factoextra)
library(caret)
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:vegan':
## 
##     tolerance
## 
## The following object is masked from 'package:purrr':
## 
##     lift

Clear environment

rm(list = ls())

Specify the FilePath and Read CSV

path <- "/Users/eamon/Desktop/MathToolsProject/R_Project/MathematicalTools/lizard.csv"

# Read a CSV file
data <- read.csv(path)

Column Names for Copying

str(data)
## 'data.frame':    740 obs. of  24 variables:
##  $ Species                        : chr  "Agama Agama" "Agama Agama" "Agama hispida" "Caimanops amphiboluroides" ...
##  $ Genus                          : chr  "Agama" "Agama" "Agama" "Caimanops" ...
##  $ Family                         : chr  "Agamidae" "Agamidae" "Agamidae" "Agamidae" ...
##  $ Population                     : chr  "Suacocco, Liberia 1952-1953" "Ibadan, NIgeria" "Kalahari Desert, Africa, 1969-1970" "Australia" ...
##  $ Longitude                      : num  -9.05 3.92 21.25 119.1 119.62 ...
##  $ Latitude                       : num  7.08 7.4 -27.03 -28.5 -27.08 ...
##  $ Average.Female.adult.weight..g.: num  NA NA 31.9 NA 10 ...
##  $ SD.Female.adult.weight..g.     : chr  NA NA "6.16" NA ...
##  $ Sample.Size.Female.adult.weight: int  NA NA 28 NA 17 75 NA 41 741 120 ...
##  $ Mean.F.SVL.adults..mm.         : num  85 97 94.8 94 62.9 ...
##  $ SD.F.SVL.adults..mm.           : num  NA NA 6.91 7.07 4.43 ...
##  $ Sample.size.Mean.F.SVL.adults  : int  63 NA 51 2 18 75 NA 41 756 120 ...
##  $ F.SVL.at.maturity..mm.         : num  75 NA 76 89 57 44 48 49 47 51 ...
##  $ Offspring.SVL..mm.             : num  32 NA 29 38 22 23 24 25 22 26 ...
##  $ Mean.Clutch.Size               : num  7.5 5.5 13.4 12 5.36 ...
##  $ Sample.size.Clutch.Size.       : int  NA NA 45 2 14 28 NA 13 40 39 ...
##  $ Mode.of.reproduction           : chr  "Oviparous" "Oviparous" "Oviparous" "Oviparous" ...
##  $ Clutches.per.year              : num  2 NA NA NA NA NA 2 NA NA NA ...
##  $ Clutch.Frequency               : num  2 2 1 1 1 1 2 1 2 2 ...
##  $ RCM                            : num  NA NA 0.305 NA 0.166 ...
##  $ Foraging.Mode                  : chr  "Sit and wait" "Sit and wait" "Sit and wait" "Sit and wait" ...
##  $ Distribution                   : chr  "Tropical" "Tropical" "Temperate" "Temperate" ...
##  $ Prefered.Habitat.Type          : chr  "Saxicolous" "Saxicolous" "Semi-arboreal" "Semi-arboreal" ...
##  $ Source                         : chr  "(Harris 1964)" "(Daniel 1960)" "(Pianka 1986)" "(Pianka 2013d)" ...

Comparison of Similar Variables

# Are clutch frequency and clutches per year the same?  
cor(data$Clutch.Frequency, data$Clutches.per.year, use = "complete.obs")
## [1] 0.7994683
plot(data$Clutch.Frequency~data$Clutches.per.year)

unique(data$Clutch.Frequency)
## [1] 2.00 1.00   NA 0.50 0.75
unique(data$Clutches.per.year)
##  [1] 2.00   NA 3.00 1.00 4.00 6.00 0.50 0.75 2.50 1.50 1.30 3.50 5.00 5.50
# Are "F.SVL.at.maturity..mm." and "Mean.F.SVL.adults..mm." the same?
cor(data$F.SVL.at.maturity..mm., data$Mean.F.SVL.adults..mm., use="complete.obs")
## [1] 0.974189
plot(data$F.SVL.at.maturity..mm.~data$Mean.F.SVL.adults..mm.)

Seems like Clutch Frequency and Clutches per Year are highly related, but Clutches Per Year goes up to 6 and Clutch Frequency only ranges between 1 and 2. Seems like “F.SVL.at.maturity..mm.” and “Mean.F.SVL.adults..mm.” are the same. Therefore one of each pair should be excluded.

##Initialise Settings of the Analysis

# These variables are kept as required to answer the research questions.
# This will result in the "data_filtered" and "data_clean" dataframes.
variables_to_keep <- c(
  ### Categorical Variables
  #"Species",
  #"Genus",
  "Family",
  #"Population",
  "Mode.of.reproduction",
  #"Source",
  "Distribution",
  "Prefered.Habitat.Type",
  "Foraging.mode",
  
  ### Location
  #"Longitude",
  #"Latitude",
  
  ### Numerical Variables
  "Average.Female.adult.weight..g.",
  "Mean.F.SVL.adults..mm.",
  #"F.SVL.at.maturity..mm.", # Removed as "Mean.F.SVL.adults..mm." is almost identical.
  "Offspring.SVL..mm.",
  "Mean.Clutch.Size",
  #"Clutches.per.year",
  "Clutch.Frequency",
  "RCM"
  
  ### Standard Deviations and Sample Sizes
  #"SD.F.SVL.adults..mm.",
  #"Sample.size.Mean.F.SVL.adults",
  #"SD.Female.adult.weight..g.",
  #"Sample.Size.Female.adult.weight"
)

# Specify the variables to be included in the PCA
# This will be result in the "data_scaled" dataframe

variables_to_use_PCA <- c(
  ### Location
  #"Longitude",
  #"Latitude",
  
  ### Means and Numerical
  "Average.Female.adult.weight..g.",
  "Mean.F.SVL.adults..mm.",
  #"F.SVL.at.maturity..mm.",
  "Offspring.SVL..mm.",
  "Mean.Clutch.Size",
  "Clutches.per.year",
  #"Clutch.Frequency",
  "RCM"
  
  ### Standard Deviations and Sample Sizes
  #"SD.Female.adult.weight..g.",
  #"SD.F.SVL.adults..mm.",
  #"Sample.Size.Female.adult.weight",
  #"Sample.size.Mean.F.SVL.adults",
)

# Specify the Variables to Use in the Random Forests Analysis
variables_to_use_RF <- c(
  ### Categorical Variables
  #"Species",
  #"Genus",
  #"Family",
  #"Population",
  "Mode.of.reproduction",
  #"Source",
  "Distribution",
  "Prefered.Habitat.Type",
  "Foraging.Mode",
  "RCM",
  
  ### Location
  #"Longitude",
  #"Latitude",
  
  ### Numerical Variables
  "Average.Female.adult.weight..g.",
  "Mean.F.SVL.adults..mm.",
  "F.SVL.at.maturity..mm.",
  "Offspring.SVL..mm.",
  "Mean.Clutch.Size",
  "Clutches.per.year",
  "Clutch.Frequency",
  "RCM"
  
  ### Standard Deviations and Sample Sizes
  #"SD.F.SVL.adults..mm.",
  #"Sample.size.Mean.F.SVL.adults",
  #"SD.Female.adult.weight..g.",
  #"Sample.Size.Female.adult.weight"
)

variables_to_use_CCA <- c(
  #"Distribution",
  #"Foraging.Mode",
  #"Mode.of.reproduction",
  "Prefered.Habitat.Type",
  "Family",
  "Latitude"
)

Cleaning

Change Variable Types and Set Blanks as NA

# Set numeric variables as numeric
data$SD.Female.adult.weight..g. <- as.numeric(data$SD.Female.adult.weight..g.)
## Warning: NAs introduced by coercion
str(data)
## 'data.frame':    740 obs. of  24 variables:
##  $ Species                        : chr  "Agama Agama" "Agama Agama" "Agama hispida" "Caimanops amphiboluroides" ...
##  $ Genus                          : chr  "Agama" "Agama" "Agama" "Caimanops" ...
##  $ Family                         : chr  "Agamidae" "Agamidae" "Agamidae" "Agamidae" ...
##  $ Population                     : chr  "Suacocco, Liberia 1952-1953" "Ibadan, NIgeria" "Kalahari Desert, Africa, 1969-1970" "Australia" ...
##  $ Longitude                      : num  -9.05 3.92 21.25 119.1 119.62 ...
##  $ Latitude                       : num  7.08 7.4 -27.03 -28.5 -27.08 ...
##  $ Average.Female.adult.weight..g.: num  NA NA 31.9 NA 10 ...
##  $ SD.Female.adult.weight..g.     : num  NA NA 6.16 NA 2.29 ...
##  $ Sample.Size.Female.adult.weight: int  NA NA 28 NA 17 75 NA 41 741 120 ...
##  $ Mean.F.SVL.adults..mm.         : num  85 97 94.8 94 62.9 ...
##  $ SD.F.SVL.adults..mm.           : num  NA NA 6.91 7.07 4.43 ...
##  $ Sample.size.Mean.F.SVL.adults  : int  63 NA 51 2 18 75 NA 41 756 120 ...
##  $ F.SVL.at.maturity..mm.         : num  75 NA 76 89 57 44 48 49 47 51 ...
##  $ Offspring.SVL..mm.             : num  32 NA 29 38 22 23 24 25 22 26 ...
##  $ Mean.Clutch.Size               : num  7.5 5.5 13.4 12 5.36 ...
##  $ Sample.size.Clutch.Size.       : int  NA NA 45 2 14 28 NA 13 40 39 ...
##  $ Mode.of.reproduction           : chr  "Oviparous" "Oviparous" "Oviparous" "Oviparous" ...
##  $ Clutches.per.year              : num  2 NA NA NA NA NA 2 NA NA NA ...
##  $ Clutch.Frequency               : num  2 2 1 1 1 1 2 1 2 2 ...
##  $ RCM                            : num  NA NA 0.305 NA 0.166 ...
##  $ Foraging.Mode                  : chr  "Sit and wait" "Sit and wait" "Sit and wait" "Sit and wait" ...
##  $ Distribution                   : chr  "Tropical" "Tropical" "Temperate" "Temperate" ...
##  $ Prefered.Habitat.Type          : chr  "Saxicolous" "Saxicolous" "Semi-arboreal" "Semi-arboreal" ...
##  $ Source                         : chr  "(Harris 1964)" "(Daniel 1960)" "(Pianka 1986)" "(Pianka 2013d)" ...
# Convert all blank values of categorical variables to NA
data<-data %>%
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  mutate(across(where(is.character), as.factor))

str(data)
## 'data.frame':    740 obs. of  24 variables:
##  $ Species                        : Factor w/ 337 levels "Acratosaura mentalis",..: 2 2 3 50 84 85 86 86 87 87 ...
##  $ Genus                          : Factor w/ 137 levels "Acratosaura",..: 2 2 2 14 32 32 32 32 32 32 ...
##  $ Family                         : Factor w/ 33 levels "Agamidae","Anguidae",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Population                     : Factor w/ 288 levels "Acre, 1996","Africa-A, 1969-1970",..: 242 107 121 33 33 33 159 33 34 35 ...
##  $ Longitude                      : num  -9.05 3.92 21.25 119.1 119.62 ...
##  $ Latitude                       : num  7.08 7.4 -27.03 -28.5 -27.08 ...
##  $ Average.Female.adult.weight..g.: num  NA NA 31.9 NA 10 ...
##  $ SD.Female.adult.weight..g.     : num  NA NA 6.16 NA 2.29 ...
##  $ Sample.Size.Female.adult.weight: int  NA NA 28 NA 17 75 NA 41 741 120 ...
##  $ Mean.F.SVL.adults..mm.         : num  85 97 94.8 94 62.9 ...
##  $ SD.F.SVL.adults..mm.           : num  NA NA 6.91 7.07 4.43 ...
##  $ Sample.size.Mean.F.SVL.adults  : int  63 NA 51 2 18 75 NA 41 756 120 ...
##  $ F.SVL.at.maturity..mm.         : num  75 NA 76 89 57 44 48 49 47 51 ...
##  $ Offspring.SVL..mm.             : num  32 NA 29 38 22 23 24 25 22 26 ...
##  $ Mean.Clutch.Size               : num  7.5 5.5 13.4 12 5.36 ...
##  $ Sample.size.Clutch.Size.       : int  NA NA 45 2 14 28 NA 13 40 39 ...
##  $ Mode.of.reproduction           : Factor w/ 2 levels "Oviparous","Viviparous": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Clutches.per.year              : num  2 NA NA NA NA NA 2 NA NA NA ...
##  $ Clutch.Frequency               : num  2 2 1 1 1 1 2 1 2 2 ...
##  $ RCM                            : num  NA NA 0.305 NA 0.166 ...
##  $ Foraging.Mode                  : Factor w/ 2 levels "Active","Sit and wait": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Distribution                   : Factor w/ 2 levels "Temperate","Tropical": 2 2 1 1 1 1 1 1 1 1 ...
##  $ Prefered.Habitat.Type          : Factor w/ 8 levels "Aquatic","Arboreal",..: 6 6 7 7 6 8 8 8 8 8 ...
##  $ Source                         : Factor w/ 236 levels "(Alcala and Brown 1967)",..: 73 37 121 130 236 129 33 129 119 119 ...

Check for and Remove Duplicates

# Check for Duplicates
duplicates <- data[duplicated(data), ]
print("Duplicate Rows:")
## [1] "Duplicate Rows:"
print(duplicates)
##                     Species        Genus          Family
## 307      Sceloporus jarrovi   Sceloporus Phrynosomatidae
## 558  Aspidoscelis exsanguis Aspidoscelis         Teiidae
## 567 Aspidoscelis tesselatus Aspidoscelis         Teiidae
## 712        Xantusia vigilis     Xantusia     Xantusiidae
## 739                    <NA>         <NA>            <NA>
## 740                    <NA>         <NA>            <NA>
##                         Population Longitude Latitude
## 307                 USA, 1962-1964   -109.58    31.00
## 558                            USA   -104.00    30.00
## 567                            USA   -104.00    30.00
## 712 Antelope Valley, Mojave Desert   -118.25    34.75
## 739                           <NA>        NA       NA
## 740                           <NA>        NA       NA
##     Average.Female.adult.weight..g. SD.Female.adult.weight..g.
## 307                           15.53                         NA
## 558                           12.65                         NA
## 567                           17.95                         NA
## 712                              NA                         NA
## 739                              NA                         NA
## 740                              NA                         NA
##     Sample.Size.Female.adult.weight Mean.F.SVL.adults..mm. SD.F.SVL.adults..mm.
## 307                              NA                  77.90                   NA
## 558                              NA                  74.99                   NA
## 567                              NA                  82.70                   NA
## 712                              NA                  41.00                   NA
## 739                              NA                     NA                   NA
## 740                              NA                     NA                   NA
##     Sample.size.Mean.F.SVL.adults F.SVL.at.maturity..mm. Offspring.SVL..mm.
## 307                            NA                     69                 NA
## 558                            NA                     63                 NA
## 567                            NA                     66                 NA
## 712                            NA                     NA                 NA
## 739                            NA                     NA                 NA
## 740                            NA                     NA                 NA
##     Mean.Clutch.Size Sample.size.Clutch.Size. Mode.of.reproduction
## 307            6.750                       NA           Viviparous
## 558            2.868                       92            Oviparous
## 567            3.240                       55            Oviparous
## 712            1.790                       NA           Viviparous
## 739               NA                       NA                 <NA>
## 740               NA                       NA                 <NA>
##     Clutches.per.year Clutch.Frequency    RCM Foraging.Mode Distribution
## 307                NA                1 0.1502  Sit and wait    Temperate
## 558                NA                1 0.1325        Active    Temperate
## 567                NA                1 0.1261        Active    Temperate
## 712                 1                1     NA  Sit and wait    Temperate
## 739                NA               NA     NA          <NA>         <NA>
## 740                NA               NA     NA          <NA>         <NA>
##     Prefered.Habitat.Type              Source
## 307            Saxicolous       (Tinkle 1973)
## 558           Terrestrial       (Schall 1978)
## 567           Terrestrial       (Schall 1978)
## 712           Terrestrial (Miller 1951, 1954)
## 739                  <NA>                <NA>
## 740                  <NA>                <NA>
# Remove duplicates
data <- data[!duplicated(data), ]

Calculate Percentage of Each Variable Which Are NA

# Replace empty cells with NA for factors
data[data == ''] <- NA

# Calculate percentage of NA values for each column
na_percentage <- colSums(is.na(data)) / nrow(data) * 100

# Convert NA percentages to a data frame
na_df <- data.frame(
  Column = names(na_percentage),
  NA_Percentage = na_percentage
)

# Create bar plot
ggplot(na_df, aes(x = Column, y = NA_Percentage)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(
    title = "Percentage of NA Values per Column",
    x = "Columns",
    y = "Percentage (%)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1) # Rotate x-axis labels
  )

Remove Unneeded Variables

Here we removed the variables that we didn’t plan to use in any part of the analysis. This was done before filtering out NAs to avoid unneccessary data loss. All rows of the RCM column which are NA are also NA for mean female adult weight (which makes sense as they’re related).

# Select only the desired variables
data_filtered <- data %>%
  select(any_of(variables_to_keep))

NAs

Check Amount of Data Loss If All NAs Removed

data_clean <- na.omit(data_filtered)
nrow(data)
## [1] 734
# Too many rows lost if all NAs removed
nrow(data_clean)
## [1] 230

PCA

Create New Dataset retaining only the PCA variables

# Create new dataset by selecting only variables to use and removing rows with NAs
data_PCA <- data %>%
  select(any_of(variables_to_use_PCA)) %>%
  na.omit()

# Create ancillary dataset with all original variables but filtered to keep only rows present in the PCA dataset
# This will be used for coloration of the PCA graphs
data_PCA_all_vars <- data %>%
  filter(row_number() %in% rownames(data_PCA))

Scale the Data

data_PCA <- data_PCA %>% 
  #Only numeric columns are selected
  mutate_all(.funs = scale)

Visualise with a Correlation

### CorrPlot
data_PCA %>% 
  cor(use="pairwise.complete.obs") %>% # Calculate the empirical correlation matrix
  corrplot() # Then graph this matrix

Run the PCA

result_pca <- PCA(data_PCA, 
               scale.unit = TRUE, # Option to center and scale data (useless here)
               ncp = 18, # Number of components to keep (here, all)
               graph = FALSE)

Result of PCA

names(result_pca) # It's a list
## [1] "eig"  "var"  "ind"  "svd"  "call"
result_pca
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 155 individuals, described by 6 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

Percentage of Information Retained

fviz_eig(result_pca, choice = "variance")

### Correlation between Each Variable and the Principle Components

corrplot(result_pca$var$cor)

Visualise the PCA

# Representation in the first principal plane
fviz_pca_var(result_pca,
             axes = c(1, 2)) # Number of axes to represent 

fviz_pca_var(result_pca,
             axes = c(1, 3)) # Number of axes to represent 

fviz_pca_var(result_pca,
             axes = c(2, 3)) # Number of axes to represent 

fviz_pca_var(result_pca,
             axes = c(1, 4))

### Plot the Individuals with the Variable Arrows and the Categorical Coloring

Color_label <- "Distribution"

# Plot on Seperate Plots
fviz_pca_biplot(result_pca,
             axes = c(1, 2),
             col.ind = data_PCA_all_vars[[Color_label]])

fviz_pca_biplot(result_pca,
             axes = c(1, 3),
             col.ind = data_PCA_all_vars[[Color_label]])

fviz_pca_biplot(result_pca,
             axes = c(2, 3),
             col.ind = data_PCA_all_vars[[Color_label]])

fviz_pca_biplot(result_pca,
             axes = c(3, 4),
             col.ind = data_PCA_all_vars[[Color_label]])

### Plot all four on one plot
# Generate the plots
p1 <- fviz_pca_biplot(result_pca, axes = c(1, 2), col.ind = data_PCA_all_vars[[Color_label]])
p2 <- fviz_pca_biplot(result_pca, axes = c(1, 3), col.ind = data_PCA_all_vars[[Color_label]])
p3 <- fviz_pca_biplot(result_pca, axes = c(2, 3), col.ind = data_PCA_all_vars[[Color_label]])
p4 <- fviz_pca_biplot(result_pca, axes = c(3, 4), col.ind = data_PCA_all_vars[[Color_label]])

# Arrange the plots in a grid
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

K-Means Clustering

# Create Clusters For Four Groups
first_kmeans <- data_PCA %>% 
  kmeans(centers = 4)

# Within Group Inertia
first_kmeans$tot.withinss
## [1] 390.3572
kmeans_5groups<- data_PCA %>%
  kmeans(centers = 5)

kmeans_4groups<- data_PCA %>%
  kmeans(centers = 4)

kmeans_3groups<- data_PCA %>%
  kmeans(centers=3)

kmeans_2groups <- data_PCA %>%
  kmeans(centers=2)

# Extract the Clustering Groups
clusters_5groups <- kmeans_5groups$cluster
clusters_4groups <- kmeans_4groups$cluster
clusters_3groups <- kmeans_3groups$cluster
clusters_2groups <- kmeans_2groups$cluster

Plot Clusters

# Representation in the first principal plane
fviz_pca_biplot(result_pca,
             axes = c(1, 2),
             col.ind = clusters_2groups)

fviz_pca_biplot(result_pca,
             axes = c(1, 3),
             col.ind = clusters_2groups)

fviz_pca_biplot(result_pca,
             axes = c(2, 3),
             col.ind = clusters_2groups)

fviz_pca_biplot(result_pca,
             axes = c(3, 4),
             col.ind = clusters_2groups)

Comparison of 2 Groups and Distribution

# Save each of the plots 
p1A <- fviz_pca_biplot(result_pca, axes = c(1, 2), col.ind = as.factor(clusters_2groups), col.var = "black", title = "Coloration from K-Means Clustering")
p1B <- fviz_pca_biplot(result_pca, axes = c(1, 2), col.ind = data_PCA_all_vars[[Color_label]], col.var = "black", title = "Coloration from Distribution")
p2A <- fviz_pca_biplot(result_pca, axes = c(1, 3), col.ind = as.factor(clusters_2groups), col.var = "black", title = "Coloration from K-Means Clustering")
p2B <- fviz_pca_biplot(result_pca, axes = c(1, 3), col.ind = data_PCA_all_vars[[Color_label]], col.var = "black", title = "Coloration from Distribution")
p3A <- fviz_pca_biplot(result_pca, axes = c(2, 3), col.ind = as.factor(clusters_2groups), col.var = "black", title = "Coloration from K-Means Clustering")
p3B <- fviz_pca_biplot(result_pca, axes = c(2, 3), col.ind = data_PCA_all_vars[[Color_label]], col.var = "black", title = "Coloration from Distribution")
p4A <- fviz_pca_biplot(result_pca, axes = c(3, 4), col.ind = as.factor(clusters_2groups), col.var = "black", title = "Coloration from K-Means Clustering")
p4B <- fviz_pca_biplot(result_pca, axes = c(3, 4), col.ind = data_PCA_all_vars[[Color_label]], col.var = "black", title = "Coloration from Distribution")

# Plot each axis combination
grid.arrange(p1A, p1B, nrow = 1, ncol = 2)

grid.arrange(p2A, p2B, nrow = 1, ncol = 2)

grid.arrange(p3A, p3B, nrow = 1, ncol = 2)

grid.arrange(p4A, p4B, nrow = 1, ncol = 2)

Correspondance Analysis

Her version

# Read data
data_counts <- read.table("donnees_comptage_genre.txt",
                      sep = ";", 
                      header = TRUE, 
                      row.names = 1)
# Convert to long tibble
data_counts_long <- data_counts %>% 
  rownames_to_column(var = "Zone") %>% # Create Zone column from row names
  # Now we transform into length by aggregating all columns except zone into
  # 2 columns, one giving the Genus name, one giving the number of individuals
  pivot_longer(cols = -c("Zone"), # Variable not affected by aggregation
               names_to = "Genus", # Old column names will be 
               # brought into a "Genus" variable
               values_to = "NbIndividuals") # Numbers are aggregated in a new
# column NbIndiviuds
dim(data_counts) # Old dimensions
## [1] 9 8
data_counts_long
## # A tibble: 72 × 3
##    Zone       Genus         NbIndividuals
##    <chr>      <chr>                 <int>
##  1 Alluvial_1 Dipterocarpus            85
##  2 Alluvial_1 Eusideroxylon           204
##  3 Alluvial_1 Garcinia                  5
##  4 Alluvial_1 Gluta                   114
##  5 Alluvial_1 Hydnocarpus             197
##  6 Alluvial_1 Pentace                   8
##  7 Alluvial_1 Shorea                  342
##  8 Alluvial_1 Tristaniopsis             0
##  9 Alluvial_2 Dipterocarpus           209
## 10 Alluvial_2 Eusideroxylon           165
## # ℹ 62 more rows

My version - Create Count of Each

# Select all four variables to be used in the CA and CCA, omit NAs
data_CCA <- data %>%
  # Select only the variables to use for the CA and CCA
  select(any_of(variables_to_use_CCA)) %>%
  # Exclude NAs
  na.omit() 

# Make a frequency table containing just 2 categorical variables for the CA
# With each pair of levels on a seperate row
freq_CA_long <- data_CCA %>%
  select(Family, Prefered.Habitat.Type) %>%
  table() %>%
  as_tibble %>%
  rename(Freq = "n",
         Habitat = "Prefered.Habitat.Type")

# Make a tibble with the count of each combination
freq_CA_short <- data_CCA %>%
  select(Prefered.Habitat.Type, Family) %>%
  count(Prefered.Habitat.Type, Family) %>%  
  pivot_wider(
    names_from = Family, 
    values_from = n,  
    values_fill = 0)

# Convert to dataframe and create rownames (for use creating average profile)
freq_CA_short_df <- freq_CA_short %>%
  as.data.frame() %>%  
  column_to_rownames(var = "Prefered.Habitat.Type")  

freq_CA_long
## # A tibble: 264 × 3
##    Family           Habitat  Freq
##    <chr>            <chr>   <int>
##  1 Agamidae         Aquatic     0
##  2 Anguidae         Aquatic     0
##  3 Anniellidae      Aquatic     0
##  4 Carphodactylidae Aquatic     0
##  5 Chamaeleonidae   Aquatic     0
##  6 Cordylidae       Aquatic     0
##  7 Corytophanidae   Aquatic     0
##  8 Crotaphytidae    Aquatic     0
##  9 Dactyloidae      Aquatic     1
## 10 Diplodactylidae  Aquatic     0
## # ℹ 254 more rows
freq_CA_short
## # A tibble: 8 × 34
##   Prefered.Habitat.Type Dactyloidae Gymnophthalmidae Shinisauridae Teiidae
##   <fct>                       <int>            <int>         <int>   <int>
## 1 Aquatic                         1                4             1       1
## 2 Arboreal                       25                0             0       3
## 3 Bromelicolous                   0                0             0       0
## 4 Fossorial                       0                2             0       0
## 5 Psammophilus                    0                0             0       0
## 6 Saxicolous                      0                0             0       0
## 7 Semi-arboreal                   0                0             0       0
## 8 Terrestrial                    14               42             0     120
## # ℹ 29 more variables: Agamidae <int>, Chamaeleonidae <int>,
## #   Corytophanidae <int>, Diplodactylidae <int>, Gekkonidae <int>,
## #   Hoplocercidae <int>, Leiosauridae <int>, Liolaemidae <int>,
## #   Opluridae <int>, Phrynosomatidae <int>, Phyllodactylidae <int>,
## #   Polychrotidae <int>, Scincidae <int>, Sphaerodactylidae <int>,
## #   Tropiduridae <int>, Varanidae <int>, Anguidae <int>, Anniellidae <int>,
## #   Lacertidae <int>, Cordylidae <int>, Crotaphytidae <int>, Iguanidae <int>, …
freq_CA_short_df
##               Dactyloidae Gymnophthalmidae Shinisauridae Teiidae Agamidae
## Aquatic                 1                4             1       1        0
## Arboreal               25                0             0       3        5
## Bromelicolous           0                0             0       0        0
## Fossorial               0                2             0       0        0
## Psammophilus            0                0             0       0        0
## Saxicolous              0                0             0       0        3
## Semi-arboreal           0                0             0       0        3
## Terrestrial            14               42             0     120       16
##               Chamaeleonidae Corytophanidae Diplodactylidae Gekkonidae
## Aquatic                    0              0               0          0
## Arboreal                   1              1               5         17
## Bromelicolous              0              0               0          0
## Fossorial                  0              0               0          0
## Psammophilus               1              0               0          0
## Saxicolous                 0              0               0          8
## Semi-arboreal              0              0               1          0
## Terrestrial                0              1               5         14
##               Hoplocercidae Leiosauridae Liolaemidae Opluridae Phrynosomatidae
## Aquatic                   0            0           0         0               0
## Arboreal                  1            3           6         1              15
## Bromelicolous             0            0           0         0               0
## Fossorial                 0            0           0         0               0
## Psammophilus              0            0           0         0               6
## Saxicolous                0            0           3         0              18
## Semi-arboreal             0            0           0         0               0
## Terrestrial               0            2          12         0              43
##               Phyllodactylidae Polychrotidae Scincidae Sphaerodactylidae
## Aquatic                      0             0         0                 0
## Arboreal                     5             4        16                 9
## Bromelicolous                0             0        10                 0
## Fossorial                    0             0         9                 0
## Psammophilus                 0             0         1                 0
## Saxicolous                  10             0         4                 0
## Semi-arboreal                0             0         0                 2
## Terrestrial                  8             0        80                 8
##               Tropiduridae Varanidae Anguidae Anniellidae Lacertidae Cordylidae
## Aquatic                  0         0        0           0          0          0
## Arboreal                15         1        0           0          0          0
## Bromelicolous            0         0        0           0          0          0
## Fossorial                0         0        2           3          0          0
## Psammophilus             1         0        0           0          1          0
## Saxicolous              33         0        1           0          3          5
## Semi-arboreal            0         0        0           0          0          0
## Terrestrial              8         2        5           0         24          0
##               Crotaphytidae Iguanidae Leiocephalidae Xenosauridae
## Aquatic                   0         0              0            0
## Arboreal                  0         0              0            0
## Bromelicolous             0         0              0            0
## Fossorial                 0         0              0            0
## Psammophilus              0         0              0            0
## Saxicolous                1         2              6           23
## Semi-arboreal             0         0              0            0
## Terrestrial              10         3              2            0
##               Carphodactylidae Eublepharidae Helodermatidae Pygopodidae
## Aquatic                      0             0              0           0
## Arboreal                     0             0              0           0
## Bromelicolous                0             0              0           0
## Fossorial                    0             0              0           0
## Psammophilus                 0             0              0           0
## Saxicolous                   0             0              0           0
## Semi-arboreal                0             0              0           0
## Terrestrial                  2             1              1           3
##               Xantusiidae
## Aquatic                 0
## Arboreal                0
## Bromelicolous           0
## Fossorial               0
## Psammophilus            0
## Saxicolous              0
## Semi-arboreal           0
## Terrestrial             5

Represent in ggplot

ggplot(freq_CA_long) +
  aes(x = Family, y = Habitat, fill = Freq) + # fill says which column is used 
  # for filling
  geom_raster() + # represents in raster mode
  # and dress to make it pretty
  scale_fill_viridis_c() + # Change color scale
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

### Calculate the Average Profile

# Create average profile  
average_profile <- colSums(freq_CA_short_df) / sum(freq_CA_short_df)
average_profile
##       Dactyloidae  Gymnophthalmidae     Shinisauridae           Teiidae 
##       0.054570259       0.065484311       0.001364256       0.169167804 
##          Agamidae    Chamaeleonidae    Corytophanidae   Diplodactylidae 
##       0.036834925       0.002728513       0.002728513       0.015006821 
##        Gekkonidae     Hoplocercidae      Leiosauridae       Liolaemidae 
##       0.053206003       0.001364256       0.006821282       0.028649386 
##         Opluridae   Phrynosomatidae  Phyllodactylidae     Polychrotidae 
##       0.001364256       0.111869031       0.031377899       0.005457026 
##         Scincidae Sphaerodactylidae      Tropiduridae         Varanidae 
##       0.163710778       0.025920873       0.077762619       0.004092769 
##          Anguidae       Anniellidae        Lacertidae        Cordylidae 
##       0.010914052       0.004092769       0.038199181       0.006821282 
##     Crotaphytidae         Iguanidae    Leiocephalidae      Xenosauridae 
##       0.015006821       0.006821282       0.010914052       0.031377899 
##  Carphodactylidae     Eublepharidae    Helodermatidae       Pygopodidae 
##       0.002728513       0.001364256       0.001364256       0.004092769 
##       Xantusiidae 
##       0.006821282
# Create the Count  
count_habitat <- rowSums(freq_CA_short_df)
count_habitat
##       Aquatic      Arboreal Bromelicolous     Fossorial  Psammophilus 
##             7           133            10            16            10 
##    Saxicolous Semi-arboreal   Terrestrial 
##           120             6           431
# Division Counts
CA_repartition <- (freq_CA_short_df / count_habitat) %>% 
  rbind(Average_profile = average_profile)
CA_repartition
##                 Dactyloidae Gymnophthalmidae Shinisauridae    Teiidae
## Aquatic          0.14285714       0.57142857   0.142857143 0.14285714
## Arboreal         0.18796992       0.00000000   0.000000000 0.02255639
## Bromelicolous    0.00000000       0.00000000   0.000000000 0.00000000
## Fossorial        0.00000000       0.12500000   0.000000000 0.00000000
## Psammophilus     0.00000000       0.00000000   0.000000000 0.00000000
## Saxicolous       0.00000000       0.00000000   0.000000000 0.00000000
## Semi-arboreal    0.00000000       0.00000000   0.000000000 0.00000000
## Terrestrial      0.03248260       0.09744780   0.000000000 0.27842227
## Average_profile  0.05457026       0.06548431   0.001364256 0.16916780
##                   Agamidae Chamaeleonidae Corytophanidae Diplodactylidae
## Aquatic         0.00000000    0.000000000    0.000000000      0.00000000
## Arboreal        0.03759398    0.007518797    0.007518797      0.03759398
## Bromelicolous   0.00000000    0.000000000    0.000000000      0.00000000
## Fossorial       0.00000000    0.000000000    0.000000000      0.00000000
## Psammophilus    0.00000000    0.100000000    0.000000000      0.00000000
## Saxicolous      0.02500000    0.000000000    0.000000000      0.00000000
## Semi-arboreal   0.50000000    0.000000000    0.000000000      0.16666667
## Terrestrial     0.03712297    0.000000000    0.002320186      0.01160093
## Average_profile 0.03683492    0.002728513    0.002728513      0.01500682
##                 Gekkonidae Hoplocercidae Leiosauridae Liolaemidae   Opluridae
## Aquatic         0.00000000   0.000000000  0.000000000  0.00000000 0.000000000
## Arboreal        0.12781955   0.007518797  0.022556391  0.04511278 0.007518797
## Bromelicolous   0.00000000   0.000000000  0.000000000  0.00000000 0.000000000
## Fossorial       0.00000000   0.000000000  0.000000000  0.00000000 0.000000000
## Psammophilus    0.00000000   0.000000000  0.000000000  0.00000000 0.000000000
## Saxicolous      0.06666667   0.000000000  0.000000000  0.02500000 0.000000000
## Semi-arboreal   0.00000000   0.000000000  0.000000000  0.00000000 0.000000000
## Terrestrial     0.03248260   0.000000000  0.004640371  0.02784223 0.000000000
## Average_profile 0.05320600   0.001364256  0.006821282  0.02864939 0.001364256
##                 Phrynosomatidae Phyllodactylidae Polychrotidae  Scincidae
## Aquatic              0.00000000       0.00000000   0.000000000 0.00000000
## Arboreal             0.11278195       0.03759398   0.030075188 0.12030075
## Bromelicolous        0.00000000       0.00000000   0.000000000 1.00000000
## Fossorial            0.00000000       0.00000000   0.000000000 0.56250000
## Psammophilus         0.60000000       0.00000000   0.000000000 0.10000000
## Saxicolous           0.15000000       0.08333333   0.000000000 0.03333333
## Semi-arboreal        0.00000000       0.00000000   0.000000000 0.00000000
## Terrestrial          0.09976798       0.01856148   0.000000000 0.18561485
## Average_profile      0.11186903       0.03137790   0.005457026 0.16371078
##                 Sphaerodactylidae Tropiduridae   Varanidae    Anguidae
## Aquatic                0.00000000   0.00000000 0.000000000 0.000000000
## Arboreal               0.06766917   0.11278195 0.007518797 0.000000000
## Bromelicolous          0.00000000   0.00000000 0.000000000 0.000000000
## Fossorial              0.00000000   0.00000000 0.000000000 0.125000000
## Psammophilus           0.00000000   0.10000000 0.000000000 0.000000000
## Saxicolous             0.00000000   0.27500000 0.000000000 0.008333333
## Semi-arboreal          0.33333333   0.00000000 0.000000000 0.000000000
## Terrestrial            0.01856148   0.01856148 0.004640371 0.011600928
## Average_profile        0.02592087   0.07776262 0.004092769 0.010914052
##                 Anniellidae Lacertidae  Cordylidae Crotaphytidae   Iguanidae
## Aquatic         0.000000000 0.00000000 0.000000000   0.000000000 0.000000000
## Arboreal        0.000000000 0.00000000 0.000000000   0.000000000 0.000000000
## Bromelicolous   0.000000000 0.00000000 0.000000000   0.000000000 0.000000000
## Fossorial       0.187500000 0.00000000 0.000000000   0.000000000 0.000000000
## Psammophilus    0.000000000 0.10000000 0.000000000   0.000000000 0.000000000
## Saxicolous      0.000000000 0.02500000 0.041666667   0.008333333 0.016666667
## Semi-arboreal   0.000000000 0.00000000 0.000000000   0.000000000 0.000000000
## Terrestrial     0.000000000 0.05568445 0.000000000   0.023201856 0.006960557
## Average_profile 0.004092769 0.03819918 0.006821282   0.015006821 0.006821282
##                 Leiocephalidae Xenosauridae Carphodactylidae Eublepharidae
## Aquatic            0.000000000    0.0000000      0.000000000   0.000000000
## Arboreal           0.000000000    0.0000000      0.000000000   0.000000000
## Bromelicolous      0.000000000    0.0000000      0.000000000   0.000000000
## Fossorial          0.000000000    0.0000000      0.000000000   0.000000000
## Psammophilus       0.000000000    0.0000000      0.000000000   0.000000000
## Saxicolous         0.050000000    0.1916667      0.000000000   0.000000000
## Semi-arboreal      0.000000000    0.0000000      0.000000000   0.000000000
## Terrestrial        0.004640371    0.0000000      0.004640371   0.002320186
## Average_profile    0.010914052    0.0313779      0.002728513   0.001364256
##                 Helodermatidae Pygopodidae Xantusiidae
## Aquatic            0.000000000 0.000000000 0.000000000
## Arboreal           0.000000000 0.000000000 0.000000000
## Bromelicolous      0.000000000 0.000000000 0.000000000
## Fossorial          0.000000000 0.000000000 0.000000000
## Psammophilus       0.000000000 0.000000000 0.000000000
## Saxicolous         0.000000000 0.000000000 0.000000000
## Semi-arboreal      0.000000000 0.000000000 0.000000000
## Terrestrial        0.002320186 0.006960557 0.011600928
## Average_profile    0.001364256 0.004092769 0.006821282
ggplot(freq_CA_long) +
  aes(x = Family, y = Habitat, fill = Freq) + # fill says which column is used 
  # for filling
  geom_raster() + # represents in raster mode
  # and dress to make it pretty
  scale_fill_viridis_c() + # Change color scale
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

### CA

# 
unique(data$Family)
##  [1] Agamidae          Anguidae          Anniellidae       Carphodactylidae 
##  [5] Chamaeleonidae    Cordylidae        Corytophanidae    Crotaphytidae    
##  [9] Dactyloidae       Diplodactylidae   Eublepharidae     Gekkonidae       
## [13] Gymnophthalmidae  Helodermatidae    Hoplocercidae     Iguanidae        
## [17] Lacertidae        Leiocephalidae    Leiosauridae      Liolaemidae      
## [21] Opluridae         Phrynosomatidae   Phyllodactylidae  Polychrotidae    
## [25] Pygopodidae       Scincidae         Shinisauridae     Sphaerodactylidae
## [29] Teiidae           Tropiduridae      Varanidae         Xantusiidae      
## [33] Xenosauridae      <NA>             
## 33 Levels: Agamidae Anguidae Anniellidae Carphodactylidae ... Xenosauridae
result_CA <- CA(freq_CA_short_df, graph = FALSE)
fviz_eig(result_CA)

fviz_ca_biplot(result_CA, repel = TRUE)

Canonical Correspondance Analysis

Create Data

mean_latitude_df <- data_CCA %>%
  group_by(Prefered.Habitat.Type) %>%           
  summarise(mean_latitude_abs = mean(abs(Latitude), na.rm = TRUE)) %>% 
  as.data.frame()   

mean_latitude_df
##   Prefered.Habitat.Type mean_latitude_abs
## 1               Aquatic           9.64369
## 2              Arboreal          15.16887
## 3         Bromelicolous          18.77975
## 4             Fossorial          28.29688
## 5          Psammophilus          29.26900
## 6            Saxicolous          18.32790
## 7         Semi-arboreal          21.67167
## 8           Terrestrial          20.77065

Run CCA

# Make a frequency table containing all 4 categorical variables to be used
result_CCA <- cca(freq_CA_short_df ~ mean_latitude_abs,
                    data = mean_latitude_df)
plot(result_CCA)

scores_CCA = scores(result_CCA)
ordiArrowMul(scores_CCA$biplot)
## [1] 3.722339
anova(result_CCA)
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = freq_CA_short_df ~ mean_latitude_abs, data = mean_latitude_df)
##          Df ChiSquare      F Pr(>F)
## Model     1   0.23944 1.2409  0.788
## Residual  6   1.15773
anova(result_CCA,by="margin")
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = freq_CA_short_df ~ mean_latitude_abs, data = mean_latitude_df)
##                   Df ChiSquare      F Pr(>F)
## mean_latitude_abs  1   0.23944 1.2409  0.753
## Residual           6   1.15773

Unsupervised Learning

##Split Data ###Filter Data Variables

data_RF <- data_clean %>%
  select(any_of(variables_to_use_RF))
str(data_RF)
## 'data.frame':    230 obs. of  9 variables:
##  $ Mode.of.reproduction           : Factor w/ 2 levels "Oviparous","Viviparous": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Distribution                   : Factor w/ 2 levels "Temperate","Tropical": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Prefered.Habitat.Type          : Factor w/ 8 levels "Aquatic","Arboreal",..: 7 6 8 8 8 8 8 8 8 8 ...
##  $ RCM                            : num  0.305 0.166 0.192 0.114 0.143 ...
##  $ Average.Female.adult.weight..g.: num  31.86 10.04 3.58 4.74 6.78 ...
##  $ Mean.F.SVL.adults..mm.         : num  94.8 62.9 48 52.6 56 ...
##  $ Offspring.SVL..mm.             : num  29 22 23 25 22 26 24 22 25 31 ...
##  $ Mean.Clutch.Size               : num  13.4 5.36 2.71 2.31 3.4 ...
##  $ Clutch.Frequency               : num  1 1 1 1 2 2 2 2 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:504] 1 2 4 7 11 12 15 17 21 22 ...
##   ..- attr(*, "names")= chr [1:504] "1" "2" "4" "7" ...

Initial Split

split_data <- data_RF %>%
  initial_split(prop = 0.8, strata = "Distribution")

# Train and Test Data
train_data <- analysis(split_data)
test_data <- assessment(split_data)

Random Forests

Create the RF

forest <- randomForest(Distribution ~ ., # Formula for prediction
                      data = train_data, # Data for training
                      ntree = 5000, # Number of trees
                      maxnodes = 10, # Number of maximum leaves for each tree
                      mtry = 4, # Number of variables for each tree
                      importance = TRUE) # Computation of importance

Test the Accuracy

# Fast way of computing accuracy with the pipe operator %>% 
predict(forest, newdata = select(test_data, - Distribution)) %>% 
  table(prediction = ., truth = test_data$Distribution) %>% 
  {sum(diag(.)) / sum(.)}
## [1] 0.7446809

Find the Variable Importance

varImpPlot(forest)

### Hyper Parameter Tuning

# Initialisation
params_CV <- trainControl(method = "repeatedcv", 
                          number = 10, 
                          repeats = 5)

predictor_forest <- caret::train(Distribution ~ ., 
                                 data = data_RF, 
                                 method = "rf", 
                                 metric = "Accuracy", 
                                 trControl = params_CV, 
                                 tuneGrid = expand.grid(mtry = 2:12))

as.data.frame(predictor_forest$results) %>% 
  ggplot(aes(x = mtry, y = Accuracy)) +
  geom_point()